import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotnine as gg
#matplotlib.style.use('ggplot')
matplotlib.style.use('seaborn')
matplotlib.rcParams['figure.figsize'] = (12,8)
%matplotlib inline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve,auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import datetime
import time
# calculate the score of model
def score(fpr, tpr):
    """Competition metric: weighted TPR at three low-FPR operating points.

    score = 0.4*TPR@(FPR>=0.001) + 0.3*TPR@(FPR>=0.005) + 0.3*TPR@(FPR>=0.01)

    fpr, tpr -- arrays as returned by sklearn.metrics.roc_curve
                (fpr is non-decreasing).
    Raises IndexError if fpr never reaches 0.01.
    """
    # np.where(...)[0][0] -> first index at which fpr crosses each threshold.
    return (0.4 * tpr[np.where(fpr >= 0.001)[0][0]]
            + 0.3 * tpr[np.where(fpr >= 0.005)[0][0]]
            + 0.3 * tpr[np.where(fpr >= 0.01)[0][0]])
# draw roc and pr curve
def roc_and_pr(y_te, y_score):
    """Plot ROC (left) and precision-recall (right) curves side by side.

    y_te    -- true binary labels.
    y_score -- predicted scores / positive-class probabilities.
    Returns (fpr, tpr) so callers can feed them straight into score();
    the original returned nothing, which left later score(fprN, tprN)
    calls with undefined names.
    """
    fpr, tpr, threshold = roc_curve(y_te, y_score)
    roc_auc = auc(fpr, tpr)
    lw = 2
    plt.figure(figsize=(15, 8))
    plt.subplot(121)
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC')
    plt.legend(loc="lower right")
    precision, recall, thresholds = precision_recall_curve(y_te, y_score)
    average_precision = average_precision_score(y_te, y_score)
    plt.subplot(122)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('PR curve: AP={0:0.3f}'.format(average_precision))
    return fpr, tpr
# Load the ATEC anti-fraud training set (expects the CSV in the working dir).
data = pd.read_csv('atec_anti_fraud_train.csv')
# NOTE(review): bare .head()/.shape only display output in a notebook cell;
# they are no-ops in a plain script.
data.head()
data.shape
# Columns 1:3 — presumably 'label' and 'date' (column 0 is the id) — the
# 'label' lookups below rely on this; TODO confirm against the CSV header.
date_df = data.iloc[:,1:3]
date_df.head()
def strptime(date):
    """Parse an int/str date like 20170905 into a time.struct_time.

    Raises ValueError if the value does not match %Y%m%d.
    """
    return time.strptime(str(date), "%Y%m%d")
# Expand the parsed dates into calendar features.
data_strptime = date_df.date.apply(strptime)
# struct_time attribute access via .apply replaces the original
# [data_strptime[i][k] for i in range(...)] list comprehensions, which did
# label-based Series lookups per element and silently depended on a clean
# RangeIndex.
date_df['year'] = data_strptime.apply(lambda t: t.tm_year)
date_df['month'] = data_strptime.apply(lambda t: t.tm_mon)
date_df['day'] = data_strptime.apply(lambda t: t.tm_mday)
date_df['weekday'] = data_strptime.apply(lambda t: t.tm_wday)  # 0=Mon .. 6=Sun
date_df['strptime'] = data_strptime
date_df.head()
# Sort chronologically (struct_time compares field-by-field).
date_df = date_df.sort_values(by='strptime')
date_df.head()
date_df.tail()
* The data spans 2017-09-05 to 2017-11-05.
* Train data: dates in [20170905, 20171020]; test data: dates in [20171021, 20171105] — roughly a 3 : 1 split.
* The four resulting splits are stored as CSV files.
# Date-based split: train = dates <= 2017-10-20, test = later dates (~3:1).
thres = time.strptime(str(20171020), "%Y%m%d")
# Boolean masks from date_df index-align with data, so rows stay matched.
train1020_with_unlabel = data[date_df['strptime'] <=thres]
test1020_with_unlabel = data[date_df['strptime'] >thres]
# "without_unlabel" variants drop the unlabeled rows (label == -1).
data_without_unlabel = data[data['label'] != -1]
date_ = date_df[date_df['label'] != -1 ]
train1020_without_unlabel = data_without_unlabel[date_['strptime'] <=thres]
test1020_without_unlabel = data_without_unlabel[date_['strptime'] >thres]
# NOTE(review): hard-coded, user-specific output paths — parameterize before reuse.
train1020_with_unlabel.to_csv('/Users/lijh/Downloads/ATEC/train1020_with_unlabel.csv',index=False)
test1020_with_unlabel.to_csv('/Users/lijh/Downloads/ATEC/test1020_with_unlabel.csv',index=False)
train1020_without_unlabel.to_csv('/Users/lijh/Downloads/ATEC/train1020_only_label.csv',index=False)
test1020_without_unlabel.to_csv('/Users/lijh/Downloads/ATEC/test1020_only_label.csv',index=False)
# Weekday histograms: all rows (top-left) vs each label class.
label_0 = date_df[date_df['label'] == 0]
label_1 = date_df[date_df['label'] == 1]
unlabel = date_df[date_df['label'] == -1]
plt.rcParams['figure.figsize'] = (20,15)
plt.subplot(221)
date_df.weekday.hist(bins = 13, density = True)
# BUG in original: this subplot shows ALL data but reused the 'label = 0' title.
plt.title('all data', fontsize = 30)
plt.subplot(222)
label_0.weekday.hist(bins = 13, density = True)
plt.title('label = 0', fontsize = 30)
plt.subplot(223)
label_1.weekday.hist(bins = 13, density = True)
plt.title('label = 1', fontsize = 30)
plt.xlabel('weekday',fontsize = 20)
plt.subplot(224)
unlabel.weekday.hist(bins = 13, density =True)
plt.title('label = -1', fontsize = 30)
plt.xlabel('weekday',fontsize = 20)
# Fraction of each label within every weekday (0=Mon .. 6=Sun).
label0_weekday = []
label1_weekday = []
unlabel_weekday = []
for i in range(7):
    weekday_ = date_df[date_df['weekday'] == i]
    num = len(weekday_)
    label0_weekday.append(len(weekday_[weekday_['label'] == 0]) / num)
    label1_weekday.append(len(weekday_[weekday_['label'] == 1]) / num)
    unlabel_weekday.append(len(weekday_[weekday_['label'] == -1]) / num)
each_weekday = pd.DataFrame({'label 0': label0_weekday, 'label 1': label1_weekday,
                             'unlabel': unlabel_weekday})
# NOTE(review): cufflinks (cf) is imported further down the file; in the
# original notebook that import cell must have been executed first.
cf.go_offline()
each_weekday[['label 1','unlabel']].iplot(kind='spread', title='The rate of label = 1 and unlabel in each weekday',yTitle = 'rate')
* Friday, Saturday and Sunday are more likely to involve risk (label = 1).
# NOTE(review): these imports are used by earlier cells (cf.go_offline above);
# in a script they belong at the top of the file.
import pygal
# NOTE(review): plotly.plotly was moved to the separate chart_studio package
# in plotly >= 4 — confirm the installed plotly version.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
from plotly import __version__
import cufflinks as cf
# Per-day label composition across the observed date range.
day = date_df['date'].value_counts().index.values
day.sort()  # in-place ndarray sort -> chronological order
day
label0_day = []
label1_day = []
unlabel_day = []
for i in day:
    day_ = date_df[date_df['date'] == i]
    num = len(day_)
    label0_day.append(len(day_[day_['label'] == 0]) / num)
    label1_day.append(len(day_[day_['label'] == 1]) / num)
    unlabel_day.append(len(day_[day_['label'] == -1]) / num)
each_day = pd.DataFrame({'day': day, 'label 0': label0_day, 'label 1': label1_day,
                         'unlabel': unlabel_day})
# Human-readable axis labels: Sep 5..30, Oct 1..31, Nov 1..5 (62 days total).
ind = (['Sep ' + str(i + 5) for i in range(26)]
       + ['Oct ' + str(i + 1) for i in range(31)]
       + ['Nov ' + str(i + 1) for i in range(5)])
each_day.index = ind
cf.go_offline()
each_day[['label 1','unlabel']].iplot(kind='spread', title='The rate of label = 1 and unlabel in each day',yTitle = 'rate')
# Feature importance on the raw features with NaN replaced by a -1 sentinel.
X_raw = data.iloc[:,3:]
y_raw = data.iloc[:,1]
from sklearn.decomposition import PCA  # BUG in original: PCA was never imported (NameError)
pca = PCA(n_components = 2)  # NOTE(review): pca is never fitted or used below
X_minus = X_raw.copy()
X_minus = X_minus.fillna(-1)  # same effect as the original masked assignment, clearer intent
X_label_1 = X_minus[y_raw != -1]  # labeled rows only
y_label = y_raw[y_raw != -1]
rnd_clf_9 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7,
                                   class_weight = 'balanced')
rnd_clf_9.fit(X_label_1, y_label)
df_9 = pd.DataFrame({'feature': X_label_1.columns,
                     'importance': rnd_clf_9.feature_importances_ })
df_9 = df_9.sort_values(by='importance', ascending =False)
df_9_ = df_9[df_9['importance'] != 0 ]
print(str(len(df_9_)) + ' features importance != 0')
sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2) # set the font size
ax = sns.barplot(y = 'feature', x = 'importance',data=df_9[df_9['importance'] >= 0.01 ]) #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20)
# Median-impute missing values across all feature columns.
X = X_raw.copy()
# NOTE(review): sklearn.preprocessing.Imputer was deprecated in 0.20 and
# removed in 0.22 — newer code should use sklearn.impute.SimpleImputer.
imputer = Imputer(strategy = 'median')
imputer.fit(X)
X_norm = imputer.transform(X)
# transform() returns a bare ndarray; restore the column names.
X = pd.DataFrame(X_norm, columns = X.columns)
# Keep only labeled rows for supervised training.
X_label = X[y_raw != -1]
X_label.shape
y_label = y_raw[y_raw != -1]
y_label.shape

# Importance ranking from a forest with class_weight='balanced_subsample'
# (class weights recomputed per bootstrap sample).
rnd_clf_1 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7,
class_weight = 'balanced_subsample')
rnd_clf_1.fit(X_label, y_label)
df_1 = pd.DataFrame({'feature': X_label.columns,
'importance': rnd_clf_1.feature_importances_ })
df_1 = df_1.sort_values(by='importance', ascending =False)
df_1_ = df_1[df_1['importance'] != 0 ]
print(str(len(df_1_)) + ' features importance != 0')
sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2) # set the font size
# Plot only features with importance >= 0.01.
ax = sns.barplot(y = 'feature', x = 'importance',data=df_1[df_1['importance'] >= 0.01 ]) #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
# NOTE(review): fontsize=0.01 makes the x tick labels effectively invisible,
# then tick_params resets label size to 20 — confirm which was intended.
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20)
# Importance ranking from a forest using class_weight='balanced'
# (global class weights, unlike the per-subsample variant above).
rnd_clf_2 = RandomForestClassifier(
    n_estimators=100,
    max_leaf_nodes=20,
    n_jobs=-1,
    random_state=7,
    class_weight='balanced',
)
rnd_clf_2.fit(X_label, y_label)
df_2 = (pd.DataFrame({'feature': X_label.columns,
                      'importance': rnd_clf_2.feature_importances_})
        .sort_values(by='importance', ascending=False))
df_2_ = df_2[df_2['importance'] != 0]
print(str(len(df_2_)) + ' features importance != 0')
sns.set(rc={'figure.figsize': (10, 10)})
sns.set(font_scale=2)  # set the font size
# Bar plot of the features whose importance reaches 0.01.
strong_features = df_2[df_2['importance'] >= 0.01]
ax = sns.barplot(y='feature', x='importance', data=strong_features)  # draw the barplot
ax.set(title=" ", xlabel="feature", ylabel="importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=0.01)
ax.tick_params(labelsize=20)
# Rows where the two nonzero-importance rankings disagree: concat aligns on
# index, so a feature present in only one of df_1_/df_2_ yields NaNs in the
# other half of its row.
t = pd.concat([df_1_, df_2_], axis = 1)
diff = [i for i in range(len(t)) if True in t.isnull().values[i]]
t.iloc[diff,:]

num1 = sum(y_label == 1)
num0 = sum(y_label == 0)
print("label=1: " + str(num1) + "\nlabel=0: " + str(num0)+
"\nratio: " + str( num0 / num1 ))
weight_1 = np.ones(len(y_label))
weight_1[y_label == 0] = 1 / ( num1*80 + num0 )
weight_1[y_label == 1] = 80 / ( num1*80 + num0 )
sum(weight_1)
# Third variant: unweighted forest, imbalance handled via sample_weight (~80:1).
rnd_clf_3 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7)
rnd_clf_3.fit(X_label, y_label, sample_weight = weight_1)
df_3 = pd.DataFrame({'feature': X_label.columns,
'importance': rnd_clf_3.feature_importances_ })
df_3 = df_3.sort_values(by='importance', ascending =False)
df_3_ = df_3[df_3['importance'] != 0 ]
print(str(len(df_3_)) + ' features importance != 0')
sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2) # set the font size
ax = sns.barplot(y = 'feature', x = 'importance', data=df_3[df_3['importance'] >= 0.01] ) #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20)
# Disagreements among the three rankings.
t2 = pd.concat([df_1_, df_2_, df_3_], axis = 1)
# BUG in original: the loop iterated range(len(t)) (the earlier 2-way concat)
# instead of len(t2), silently truncating or over-running the scan.
diff = [i for i in range(len(t2)) if True in t2.isnull().values[i]]
t2.iloc[diff,:]
# Milder reweighting: fraud counted 30x instead of 80x, same normalization.
weight_2 = np.ones(len(y_label))
weight_2[y_label == 0] = 1 / ( num1*30 + num0 )
weight_2[y_label == 1] = 30 / ( num1*30 + num0 )
# Sanity check: weights should sum to approximately 1.
sum(weight_2)
# Fourth variant: unweighted forest with the milder 30:1 sample weights.
rnd_clf_4 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7)
rnd_clf_4.fit(X_label, y_label, sample_weight = weight_2)
df_4 = pd.DataFrame({'feature': X_label.columns,
'importance': rnd_clf_4.feature_importances_ })
df_4 = df_4.sort_values(by='importance', ascending =False)
df_4_ = df_4[df_4['importance'] != 0 ]
print(str(len(df_4_)) + ' features importance != 0')
sns.set(rc={'figure.figsize':(10,10)})
sns.set(font_scale=2) # set the font size
ax = sns.barplot(y = 'feature', x = 'importance', data=df_4[df_4['importance'] >= 0.01] ) #draw the barplot
ax.set(title = " ", xlabel = "feature", ylabel = "importance")
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90, fontsize = 0.01)
ax.tick_params(labelsize=20)
# Disagreements among all four rankings.
t3 = pd.concat([df_1_, df_2_, df_3_, df_4_], axis = 1)
# BUG in original: iterated range(len(t)) instead of len(t3).
diff = [i for i in range(len(t3)) if True in t3.isnull().values[i]]
t3.iloc[diff,:].head()
print("There are " + str(len(t3.iloc[diff,:])) + " differences.")
# Side-by-side view of the full rankings around positions 35-46.
pd.concat([df_1, df_2, df_3, df_4], axis = 1).loc[35:46,:]
# Random 80/20 split performed on the labels; features are then selected by
# index so X and y stay row-aligned.
y_tr, y_te = train_test_split(y_label, test_size = 0.2, random_state =42)
print(len(y_tr), "train+", len(y_te), 'test')
X_tr = X.loc[y_tr.index,:]
X_te = X.loc[y_te.index,:]
rnd_clf_5 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7,
class_weight = 'balanced')
rnd_clf_5.fit(X_tr, y_tr)
y_pred_1 = rnd_clf_5.predict(X_te)
confusion_matrix(y_te, y_pred_1)
y_pred_prob_1 = rnd_clf_5.predict_proba(X_te)
# Column 1 of predict_proba = probability of the positive (fraud) class.
roc_and_pr(y_te, y_pred_prob_1[:,1])
# Semi-supervised guess: append all unlabeled rows to the training set with
# label 1 ("assume unlabeled = fraud") and retrain.
ind_unlabel = y_raw == -1
X_unlabel = X[ind_unlabel]
X_unlabel.shape
X_2 = pd.concat([X_tr, X_unlabel])
# BUG in original: concatenating the named Series y_tr with a DataFrame of
# ones produced two misaligned columns, leaving NaNs in the label vector.
# Stacking two Series keeps a single clean column.
y_unlabel_1 = pd.Series(np.ones(sum(ind_unlabel)))
y_2 = pd.concat([y_tr, y_unlabel_1])
rnd_clf_6 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7,
                                   class_weight = 'balanced')
rnd_clf_6.fit(X_2, y_2)
y_pred_2 = rnd_clf_6.predict(X_te)
confusion_matrix(y_te, y_pred_2)
y_pred_prob_2 = rnd_clf_6.predict_proba(X_te)
roc_and_pr(y_te, y_pred_prob_2[:,1])
# Train/test by date (3:1 split at 2017-10-20); NaN filled with -1 sentinel.
X_tr_1020 = train1020_without_unlabel.iloc[:,3:]
y_tr_1020 = train1020_without_unlabel.iloc[:,1]
X_te_1020 = test1020_without_unlabel.iloc[:,3:]
y_te_1020 = test1020_without_unlabel.iloc[:,1]
X_tr_1020 = X_tr_1020.fillna(-1)
X_te_1020 = X_te_1020.fillna(-1)
rnd_clf_7 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7,
                                   class_weight = 'balanced')
rnd_clf_7.fit(X_tr_1020, y_tr_1020)
y_pred_3 = rnd_clf_7.predict(X_te_1020)
confusion_matrix(y_te_1020, y_pred_3)
y_pred_prob_3 = rnd_clf_7.predict_proba(X_te_1020)
roc_and_pr(y_te_1020, y_pred_prob_3[:,1])
# BUG in original: fpr3/tpr3 were never defined; compute them explicitly.
fpr3, tpr3, _ = roc_curve(y_te_1020, y_pred_prob_3[:,1])
score(fpr3, tpr3)
# data_without_unlabel = data[data['label'] != -1]
# date_ = date_df[date_df['label'] != -1 ]
# Split the labeled data into three consecutive ~15-day training windows; the
# final window (dates after 10-20) is the shared test set.
thres1 = time.strptime(str(20170920), "%Y%m%d")
thres2 = time.strptime(str(20171005), "%Y%m%d")
thres3 = time.strptime(str(20171020), "%Y%m%d")
train11 = data_without_unlabel[date_['strptime'] <=thres1]
test11 = data_without_unlabel[ date_['strptime'] > thres3 ]
train22 = data_without_unlabel[(date_['strptime'] >thres1) & (date_['strptime'] <= thres2)]
train33 = data_without_unlabel[(date_['strptime'] >thres2) & (date_['strptime'] <= thres3)]
# Window 1 model (dates up to 09-20).
X_tr_11 = train11.iloc[:,3:]
y_tr_11 = train11.iloc[:,1]
X_te_11 = test11.iloc[:,3:]
y_te_11 = test11.iloc[:,1]
X_tr_11 = X_tr_11.fillna(-1)
X_te_11 = X_te_11.fillna(-1)
rnd_clf_8 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7,
                                   class_weight = 'balanced')
rnd_clf_8.fit(X_tr_11, y_tr_11)
y_pred_4 = rnd_clf_8.predict(X_te_11)
confusion_matrix(y_te_11, y_pred_4)
y_pred_prob_4= rnd_clf_8.predict_proba(X_te_11)
roc_and_pr(y_te_11, y_pred_prob_4[:,1])
# BUG in original: fpr4/tpr4 were never defined; compute them explicitly.
fpr4, tpr4, _ = roc_curve(y_te_11, y_pred_prob_4[:,1])
score(fpr4, tpr4)
# Window 2 model (09-21 .. 10-05), evaluated on the same test window.
X_tr_22 = train22.iloc[:,3:]
y_tr_22 = train22.iloc[:,1]
X_tr_22 = X_tr_22.fillna(-1)
rnd_clf_10 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7,
                                    class_weight = 'balanced')
rnd_clf_10.fit(X_tr_22, y_tr_22)
y_pred_5 = rnd_clf_10.predict(X_te_11)
confusion_matrix(y_te_11, y_pred_5)
y_pred_prob_5= rnd_clf_10.predict_proba(X_te_11)
roc_and_pr(y_te_11, y_pred_prob_5[:,1])
# BUG in original: fpr5/tpr5 were never defined; compute them explicitly.
fpr5, tpr5, _ = roc_curve(y_te_11, y_pred_prob_5[:,1])
score(fpr5, tpr5)
# Window 3 model (10-06 .. 10-20), evaluated on the same test window.
X_tr_33 = train33.iloc[:,3:]
y_tr_33 = train33.iloc[:,1]
X_tr_33 = X_tr_33.fillna(-1)
rnd_clf_11 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7,
                                    class_weight = 'balanced')
rnd_clf_11.fit(X_tr_33, y_tr_33)
y_pred_6 = rnd_clf_11.predict(X_te_11)
confusion_matrix(y_te_11, y_pred_6)
y_pred_prob_6= rnd_clf_11.predict_proba(X_te_11)
roc_and_pr(y_te_11, y_pred_prob_6[:,1])
# BUG in original: fpr6/tpr6 were never defined; compute them explicitly.
fpr6, tpr6, _ = roc_curve(y_te_11, y_pred_prob_6[:,1])
score(fpr6, tpr6)
Using the train/test data from section 2.3.2 — i.e. the sets split by date, with train : test = 3 : 1.
# rnd_clf_7 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7,
# class_weight = 'balanced')
# rnd_clf_7.fit(X_tr_1020, y_tr_1020)
# y_pred_3 = rnd_clf_7.predict(X_te_1020)
# Re-evaluate the previously fitted date-split model (rnd_clf_7 from above).
confusion_matrix(y_te_1020, y_pred_3)
y_pred_prob_3 = rnd_clf_7.predict_proba(X_te_1020)
roc_and_pr(y_te_1020, y_pred_prob_3[:,1])
def jiangshan( y_score ):
    """One round of self-training: pseudo-label confident test rows as fraud.

    Test rows with predicted score > 0.9 are appended to the training set with
    label 1, a fresh forest is trained, and its test-set probabilities are
    returned so the procedure can be iterated.
    Reads module-level X_tr_1020 / X_te_1020 / y_tr_1020 / y_te_1020.
    """
    ind_add = y_score > 0.9
    X_add = pd.concat([X_tr_1020, X_te_1020[ind_add]])
    print(len(X_add) - len(X_tr_1020))  # BUG in original: referenced undefined X_add_1
    # Series (not DataFrame) so the labels stack into a single clean column.
    y_add = pd.concat([y_tr_1020, pd.Series(np.ones(sum(ind_add)))])
    rnd_clf_js = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7,
                                        class_weight = 'balanced')
    rnd_clf_js.fit(X_add, y_add)
    y_pred_js = rnd_clf_js.predict(X_te_1020)
    print( confusion_matrix(y_te_1020, y_pred_js) )
    y_pred_prob_js = rnd_clf_js.predict_proba(X_te_1020)  # BUG in original: undefined rnd_clf_12
    roc_and_pr(y_te_1020, y_pred_prob_js[:,1])
    return y_pred_prob_js
# NOTE(review): the original called jiangshan(y_pred_prob_7[:,1]) but
# y_pred_prob_7 is never defined; y_pred_prob_3 (rnd_clf_7's test
# probabilities) appears to be what was meant — confirm.
jiangshan(y_pred_prob_3[:,1])
# Score the competition test set with rnd_clf_1 (median-imputed features).
test = pd.read_csv('atec_anti_fraud_test_a.csv')
X_test = test.iloc[:,2:]
X_test_ = X_test.copy()
X_test_norm = imputer.transform(X_test_)  # reuse the train-fitted imputer
X_test_ = pd.DataFrame(X_test_norm, columns = X_test_.columns)
y_test_prob = rnd_clf_1.predict_proba(X_test_)
# BUG in original: {'score': score} stored the score() FUNCTION object in the
# submission column; the fraud probability was clearly intended.
final = pd.DataFrame({'id':test.iloc[:,0], 'score': y_test_prob[:,1]})
final.head()
final.to_csv('/Users/lijh/Downloads/ATEC/final.csv',index=False)
# Second submission: treat every unlabeled row as fraud and retrain on all rows.
y_3 = y_raw.copy()
y_3[ind_unlabel] = 1
# NOTE(review): this rebinds the name rnd_clf_7, clobbering the earlier
# date-split model of the same name.
rnd_clf_7 = RandomForestClassifier(n_estimators=100, max_leaf_nodes=20, n_jobs = -1, random_state = 7,
class_weight = 'balanced')
rnd_clf_7.fit(X, y_3)
y_test_prob_2 = rnd_clf_7.predict_proba(X_test_)
score_2 = y_test_prob_2[:,1]
final_2 = pd.DataFrame({'id':test.iloc[:,0], 'score': score_2})
final_2.head()
final_2.to_csv('/Users/lijh/Downloads/ATEC/final2.0.csv',index=False)
# Constant baselines: score every transaction 1 (all fraud) / 0 (no fraud).
final_3 = pd.DataFrame({'id': test.iloc[:, 0], 'score': 1})
final_3.head()
final_3.to_csv('/Users/lijh/Downloads/ATEC/final3.0.csv',index=False)
final_4 = pd.DataFrame({'id': test.iloc[:, 0], 'score': 0})
final_4.head()
final_4.to_csv('/Users/lijh/Downloads/ATEC/final4.0.csv',index=False)
# Fifth submission setup: unlabeled rows treated as fraud, NaN filled with -1.
y_5 = y_raw.copy()
y_5[ind_unlabel] = 1
X_5 = X_raw.copy()
X_5 = X_5.fillna(-1)
X_test_5 = X_test.copy()
X_test_5 = X_test_5.fillna(-1)
rnd_clf_20 = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7,
class_weight = 'balanced')
rnd_clf_20.fit(X_5, y_5)
y_test_prob_5 = rnd_clf_20.predict_proba(X_test_5)
# Initial fraud-probability scores; refined below by self-training rounds.
score_5 = y_test_prob_5[:,1]
def jiangshan2( y_score ):
    """Self-training round on the full data: pseudo-label competition-test rows
    scored > 0.7 as fraud, retrain, and return the new test probabilities.

    Reads module-level X_5 / y_5 / X_test_5.
    """
    ind_add = y_score > 0.7
    X_add = pd.concat([X_5, X_test_5[ind_add]])
    print(len(X_add) - len(X_5))  # how many rows were pseudo-labeled this round
    # Series (not DataFrame) so the labels stack into a single clean column;
    # the original concat of a named Series with a DataFrame misaligned columns.
    y_add = pd.concat([y_5, pd.Series(np.ones(sum(ind_add)))])
    rnd_clf_js = RandomForestClassifier(n_estimators=100, n_jobs = -1, random_state = 7,
                                        class_weight = 'balanced')
    rnd_clf_js.fit(X_add, y_add)
    y_pred_prob_js = rnd_clf_js.predict_proba(X_test_5)
    return y_pred_prob_js
# Five manual self-training iterations.
score_5 = jiangshan2(score_5)[:,1]
score_5 = jiangshan2(score_5)[:,1]
score_5 = jiangshan2(score_5)[:,1]
score_5 = jiangshan2(score_5)[:,1]
score_5 = jiangshan2(score_5)[:,1]
# Nineteen more self-training rounds (24 in total with the manual ones above).
count = 1
while count < 20:
    score_5 = jiangshan2(score_5)[:,1]
    count = count + 1
# Parse and sort the competition test set by date, mirroring the train side.
test_strptime = test.date.apply(strptime)
test_df = test.copy()
test_df['strptime'] = test_strptime
test_df = test_df.sort_values(by='strptime')
y_label = y_raw[y_raw != -1]
# NOTE(review): the remainder of this section looks like leftover scratch
# cells — several names (ind, y_unlabel, rnd_clf, X2, X_tr_add_unlabel) are
# never defined anywhere in this file, so these lines cannot run as-is.
X_tr_part = X_tr.iloc[0:40000,:]
y_tr_part = y_tr[0:40000]
X_tr_part_imp = X_tr_part.iloc[:,ind]
y_unlabel_1 = y_unlabel.copy()
y_unlabel_2 = rnd_clf.predict(X_unlabel)
y_unlabel_2 = pd.DataFrame(y_unlabel_2)
y2 = pd.concat([y_tr_part, y_unlabel_2]).iloc[:,0]
rnd_clf.fit(X2, y2)
y_pred_rf = rnd_clf.predict(X_te)
confusion_matrix(y_te, y_pred_rf)
# Pseudo-label the unlabeled rows with rnd_clf_1's predictions.
y_unlabel2 = rnd_clf_1.predict(X_unlabel)
y_unlabel2 = pd.DataFrame(y_unlabel2)
y_tr_2 = pd.concat([y_tr, y_unlabel2])
y_tr_2 = y_tr_2.iloc[:,0]
# NOTE(review): rebinds rnd_clf_2, clobbering the earlier importance model.
rnd_clf_2 = RandomForestClassifier(n_estimators=200, max_leaf_nodes=20, n_jobs = -1)
rnd_clf_2.fit(X_tr_add_unlabel, y_tr_2)
y_pred_rf_2 = rnd_clf_2.predict(X_te)
confusion_matrix(y_te, y_pred_rf_2)
y_unlabel = rnd_clf.predict(X_unlabel)
# Merge the labeled and unlabeled data.  (translated from the original Chinese comment)
# NOTE(review): X_tr_3 / y_tr_3 are never defined in this file — more scratch;
# rnd_clf_3 here also clobbers the earlier sample-weighted importance model.
X_tr_3
y_tr_3
rnd_clf_3 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=20, n_jobs = -1)
rnd_clf_3.fit(X_tr_3, y_tr_3)
y_pred_rf_3 = rnd_clf_3.predict(X_te)
confusion_matrix(y_te, y_pred_rf_3)
rnd_clf_2.fit(X_tr_add_unlabel, y_tr_2)
sklearn.cluster.KMeans(n_clusters=8, random_state=7, n_jobs= -1 )
# BUG in original: builtin sum() has no skipna keyword (TypeError);
# pandas provides NaN-aware summation of a list containing None.
a = [1, None, 2]
nan_safe_sum = pd.Series(a).sum(skipna = True)